#coding:utf-8
import networkx as nx
from networkx.algorithms import bipartite
import pandas as pd
import numpy as np
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import os
import cn2an
import chinese2digits as c2d
import operator
import plotly.express as px
import plotly
import seaborn
import plotly.graph_objects as go
import plotly as py
import plotly.offline as offline
from matplotlib import colors
offline.init_notebook_mode(connected=True)
from urllib.request import urlopen
import json
import requests
from plotly.subplots import make_subplots
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas
from pysal.explore import esda # Exploratory Spatial analytics
from pysal.lib import weights
import pysal
import contextily
from ipynb.fs.full.case_to_graph_only_method import convert_csv_graph, provinces_map
# NOTE(review): a live Mapbox token is hard-coded in source — consider moving it to an env var.
mapbox_access_token = "pk.eyJ1Ijoic3RhcmljZSIsImEiOiJjazN6Y2s5dTUxY2R6M2xxcHllbXk4YWFzIn0.lActFqLzqRWGn7dqr4BShw"
px.set_mapbox_access_token(mapbox_access_token)
# Province-level boundary polygons for China; per the info() output below the frame
# has columns NL_NAME_1 / ENGTYPE_1 / geometry (33 provinces).
prov_lads = geopandas.read_file("/Users/starice/OwnFiles/cityu/RA/code/case_process/china_province.geojson")
prov_lads.info()
<class 'geopandas.geodataframe.GeoDataFrame'> RangeIndex: 33 entries, 0 to 32 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 NL_NAME_1 33 non-null object 1 ENGTYPE_1 33 non-null object 2 geometry 33 non-null geometry dtypes: geometry(1), object(2) memory usage: 920.0+ bytes
# Directory layout of the extracted judgment documents:
# four document types x years 2014-2020 x months 1-12.
base_url = "/Users/starice/Desktop/total_extracted_result/"
pre_dir = ['type1', 'type2', 'type3', 'type4']
dir_name = ['2014', '2015', '2016', '2017', '2018', '2019', '2020']
dir_sname = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
# csv_graph, _new_csvpd = convert_csv_graph(pre_dir[:], dir_name[:], dir_sname[:])
# Load the pre-built bipartite case/party graph instead of rebuilding it from the CSVs.
# NOTE(review): nx.read_gpickle was removed in networkx 3.x — pin networkx<3 or switch to pickle.load.
csv_graph = nx.read_gpickle("/Users/starice/Desktop/csv_graph.gpickle")
# basic methods of this bipartite graph
# csv_graph['5924d05ce13823b54caac225']['王瑜']
csv_graph.edges('58113ef92f12c63192b55835')
# csv_graph.degree['周开礼']
# csv_graph.get_edge_data('张丙刚', '5ec92ef13cdef0087ea6a1cc')
EdgeDataView([('58113ef92f12c63192b55835', '黄海旺'), ('58113ef92f12c63192b55835', '陕西秦食实业有限公司'), ('58113ef92f12c63192b55835', '李军民'), ('58113ef92f12c63192b55835', '宋军庆')])
# Dump all distinct party titles found on graph edges so they can be inspected
# and manually sorted into the plaintiff / defendant title lists below.
titles = nx.get_edge_attributes(csv_graph, "title").values()
# Fix: iterate the unique titles in sorted order — bare set iteration makes the
# dump file's line order nondeterministic across runs.
with open("/Users/starice/Desktop/party_titles.txt", 'w') as f:
    for i in sorted(set(titles), key=str):
        f.write(str(i))
        f.write("\n")
# list(set(titles))
# Edge 'title' values that mark the attached party as the (acting) plaintiff side,
# curated by hand from the party_titles.txt dump above.
plaintiff_titles = ['上诉人', '上诉人(一审原告)', '上诉人(一审第三人)', '上诉人(原告)', '上诉人(原审原告)', '上诉人(原审原告、反诉被告)',
'上诉人(原审原告人)', '上诉人(原审第三人)', '公益诉讼起诉人', '再审申请人', '再审申请人(一审原告)',
'再审申请人(一审原告、二审上诉人)',
'再审申请人(一审原告、二审被上诉人)', '再审申请人(原审原告)',
'再审申请人(原审原告、二审上诉人)', '再审申请人:(一审第三人、二审上诉人)', '原告',
'原告(反诉被告)', '抗诉机关', '支持起诉人',
'支持起诉机关', '申诉人(一审原告、二审上诉人)',
'申诉人(一审原告、二审上诉人、再审申请人)',
'申诉人(一审原告、二审上诉人、原再审申请人)',
'申诉人(一审原告、二审被上诉人)',
'申诉人(原审原告)',
'申请再审人(一审原告、二审上诉人)', '被上诉人(一审原告)', '被上诉人(原审原告)',
'被上诉人(原审原告、反诉被告)', '被上诉人(原甲原告)',
'被上诉人一(原审原告)',
'被上诉人一(原审被告一)',
'被上诉人二(原审被告二)', '被申请人(一审原告、二审上诉人)',
'被申请人(一审原告、二审被上诉人)', '被申请人(原审原告)']
# Edge 'title' values that mark the attached party as the (acting) defendant side,
# curated by hand from the party_titles.txt dump above.
defendant_titles = ['(一审被告、二审被上诉人)', '一审被告', '一审被告(二审上诉人)', '一审被告、二审被上诉人', '一审被告二审上诉人)',
'上上诉人(原审被告)', '上诉人(一审被告)', '上诉人(原审第一被告)', '上诉人(原审被告)',
'上诉人(原审被告、反诉原告)',
'上诉人(原审被告一)', '上诉人(被告)', '公益诉讼出庭人', '再审申请人(一审被告)',
'再审申请人(一审被告、二审上诉人)',
'再审申请人(一审被告、二审被上诉人)', '再审申请人(再审被告)', '再审申请人(原审被告)', '原审当事人(原审被告)',
'原审第三被告',
'原审第二被告',
'原审被告',
'原审被告(反诉原告)', '特别授权被告', '申请再审人(原审被告)',
'第一被告', '第三被告',
'第二被告',
'被上诉人',
'被上诉人(一审被告)', '被上诉人(原审第三人)',
'被上诉人(原审被告)',
'被上诉人(原审被告、反诉原告)',
'被上诉人(原审被告人)', '被告',
'被告(反诉原告)',
'被告一',
'被告二', '被申诉人(一审被告,二审被上诉人)',
'被申诉人(一审被告、二审上诉人)',
'被申诉人(一审被告、二审被上诉人)',
'被申诉人(一审被告、二审被上诉人、再审被申请人)',
'被申诉人(一审被告、二审被上诉人、原再审被申请人)',
'被申诉人(原审被告)',
'被申请人', '被申请人(一审被告)',
'被申请人(一审被告,二审被上诉人)',
'被申请人(一审被告、二审上诉人)',
'被申请人(一审被告、二审被上诉人)', '被申请人(原审被告)', '被申请人(原审被告、二审被上诉人)']
# plaintiff_titles = ['原告', '上诉人(原审原告)', '上诉人(一审原告)', '上诉人', \
# '上诉人(原审被告、反诉原告)', '再审申请人(一审原告、二审上诉人)', '被申请人(一审原告、二审被上诉人)', \
# '再审申请人(一审原告、二审上诉人)', '申诉人(一审原告、二审上诉人)', '被告(反诉原告)', '被申请人(原审原告)', \
# '申诉人(一审原告、二审上诉人、原再审申请人)', '被上诉人(原审原告)', '被上诉人(原审原告、反诉被告)', '再审申请人(原审原告)', \
# '申诉人(一审原告、二审上诉人、再审申请人)', '再审申请人', \
# '申诉人(原审原告)', '公益诉讼起诉人', '诉讼代表人', '上诉人(原审原告、反诉被告)', '再审申请人(一审原告)', \
# '上诉人(原审原告人)', '被申请人(一审原告、二审上诉人)', '再审申请人:(一审第三人、二审上诉人)', \
# '申请再审人(一审原告、二审上诉人)', '上诉人(原告)', '原审当事人(原审被告)']
# defendant_titles = ['被上诉人一(原审被告一)', '被上诉人二(原审被告二)', '被申诉人(一审被告、二审被上诉人、再审被申请人)', \
# '被申诉人(原审被告)', '被申请人(一审被告)', '被申请人', \
# '被申诉人(一审被告、二审上诉人)', '原审第二被告', '原审第三被告', '被申请人(一审被告、二审被上诉人)', \
# '第三被告', '一审被告', '被申请人(一审被告、二审上诉人)', '被申请人(原审被告)', '被告', \
# '被上诉人(原审被告)', '原审被告', '被上诉人(一审被告)', '第一被告', '第二被告', '被告一', \
# '被告二', '被上诉人(一审原告)', '被上诉人(原审第三人)', '被上诉人', \
# '被上诉人(原审被告、反诉原告)', '被上诉人(原审被告人)', '上诉人(原审被告)', '上诉人(一审被告)', \
# '上诉人(原审第一被告)', '申请再审人(原审被告)', '再审申请人(原审被告)', \
# '上上诉人(原审被告)', '再审申请人(一审被告、二审上诉人)', '原告(反诉被告)', '再审申请人(一审被告)', \
# '再审申请人(一审被告、二审被上诉人)', '一审被告、二审被上诉人', '被申请人(一审被告,二审被上诉人)', \
# '被申诉人(一审被告、二审被上诉人)', '被申诉人(一审被告、二审被上诉人、原再审被申请人)', '上诉人(被告)']
cases = [(n, d) for n, d in csv_graph.nodes(data=True) \
if d['bipartite']==0]
plaintiffs = [(e1, e2, d) for e1, e2, d in csv_graph.edges(nbunch=[n[0] for n in cases], data=True) \
if d['title'] in plaintiff_titles]
defendants = [(e1, e2, d) for e1, e2, d in csv_graph.edges(nbunch=[n[0] for n in cases], data=True) \
if d['title'] in defendant_titles]
pd_plaintiffs = pd.DataFrame({"case_id": [i[0] for i in plaintiffs], "plaintiff": [i[1] for i in plaintiffs]})
pd_defendants = pd.DataFrame(({"case_id": [i[0] for i in defendants], "defendant": [i[1] for i in defendants]}))
pd_cases = pd.DataFrame({"case_id": [i[0] for i in cases], \
"judgement_date": [i[1]['judgement_date'] for i in cases], \
"is_success": [i[1]['is_success'] for i in cases], \
# "lat": [i[1]['lat'] for i in cases], \
# "lon": [i[1]['lon'] for i in cases], \
"court_name": [i[1]['court_name'] for i in cases], \
"procedure": [i[1]['procedure'] for i in cases], \
"judge": [i[1]['judge'] for i in cases], \
"legalfee": [i[1]['legalfee'] for i in cases], \
"objectmoney": [i[1]['objectmoney'] for i in cases], \
"province": [i[1]['province'] for i in cases], \
"city": [i[1]['city'] for i in cases], \
"reason": [i[1]['reason'] for i in cases], \
"district": [i[1]['district'] for i in cases], \
"penalty": [i[1]['penalty'] for i in cases]})
pd_cases['year'] = pd.DatetimeIndex(pd_cases['judgement_date']).year
pd_cases['month'] = pd.DatetimeIndex(pd_cases['judgement_date']).month
pd_cases['day'] = pd.DatetimeIndex(pd_cases['judgement_date']).day
all_cases = pd_cases.merge(pd_plaintiffs, on="case_id", how="left")
all_cases = all_cases.merge(pd_defendants, on="case_id", how="left")
len(cases), all_cases['case_id'].nunique()
(39257, 39257)
# Degree of each distinct plaintiff node = number of cases it appears in.
plaintiff_degrees = csv_graph.degree(set([i[1] for i in plaintiffs]))
degree_map = dict(plaintiff_degrees)
df_plaintiff_degrees = pd.DataFrame({"plaintiff": degree_map.keys(),
                                     "degree": degree_map.values()})
# Drop the empty-name artifact and rank plaintiffs by case count, descending.
df_plaintiff_degrees = df_plaintiff_degrees[df_plaintiff_degrees['plaintiff'] != '']
df_plaintiff_degrees = df_plaintiff_degrees.sort_values(by="degree", ascending=False)
print("The total number of plaintiffs is: ", len(df_plaintiff_degrees['plaintiff']))
print("The total number of repeat plaintiffs is: ", len(df_plaintiff_degrees[df_plaintiff_degrees['degree'] > 1]))
df_plaintiff_degrees.describe()
The total number of plaintiffs is: 5662 The total number of repeat plaintiffs is: 3305
| degree | |
|---|---|
| count | 5662.000000 |
| mean | 8.422112 |
| std | 29.061245 |
| min | 1.000000 |
| 25% | 1.000000 |
| 50% | 2.000000 |
| 75% | 5.000000 |
| max | 609.000000 |
# Degree of each distinct defendant node = number of cases it appears in.
defendant_degrees = csv_graph.degree(set([i[1] for i in defendants]))
df_defendant_degrees = pd.DataFrame({"defendant": dict(defendant_degrees).keys(), \
                                     "degree": dict(defendant_degrees).values()})
# Drop the empty-name artifact and rank defendants by case count, descending.
df_defendant_degrees = df_defendant_degrees[df_defendant_degrees['defendant'] != '']
df_defendant_degrees = df_defendant_degrees.sort_values(by="degree", ascending=False)
# Fix: count from the filtered frame (empty names removed) so the printed total
# is consistent with the plaintiff cell above and with describe()'s count —
# the raw set previously printed a larger figure than the frame contains.
print("The total number of defendants is: ", len(df_defendant_degrees['defendant']))
print("The total number of repeat defendants is: ", len(df_defendant_degrees[df_defendant_degrees['degree'] > 1]))
df_defendant_degrees.describe()
The total number of defendants is: 15726 The total number of repeat defendants is: 5343
| degree | |
|---|---|
| count | 15629.000000 |
| mean | 3.560816 |
| std | 25.486427 |
| min | 1.000000 |
| 25% | 1.000000 |
| 50% | 1.000000 |
| 75% | 2.000000 |
| max | 2407.000000 |
# 看一下异常degree 的 defendant
# 名称正常
df_defendant_degrees[df_defendant_degrees['degree'] > 1000]
| defendant | degree | |
|---|---|---|
| 12126 | 天津市人人乐商业有限公司 | 2407 |
| 10148 | 重庆永辉超市有限公司 | 1090 |
# Overlay plaintiff and defendant degree histograms in a single figure; both
# distributions are strongly right-skewed, so the y-axis is log-scaled.
plaintiff_trace = go.Histogram(
    x=df_plaintiff_degrees[df_plaintiff_degrees['degree'] > 1]['degree'],
    bingroup=1,
    name="plaintiff_degree",
)
defendant_trace = go.Histogram(
    x=df_defendant_degrees[df_defendant_degrees['degree'] > 1]['degree'],
    bingroup=1,
    name="defendant_degree",
)
fig = go.Figure(data=[plaintiff_trace, defendant_trace])
fig.update_layout(
    barmode="relative",
    bargap=0.2,
    title="Degree Distribution of All Parties Degrees",
)
fig.update_yaxes(type="log")
fig.show()
pd_1stcases = all_cases[all_cases['procedure']=="一审"]
len(pd_1stcases['case_id'].drop_duplicates())
32445
time_pd_1stcases = pd_1stcases.groupby("year")['case_id'].nunique().reset_index()
time_pd_1stcases.rename(columns={"case_id": "case_count"}, inplace=True)
fig = px.bar(
time_pd_1stcases,
x = "year",
y = "case_count",
title = "Temporal Distribution of All First Cases"
)
fig.show()
# NOTE(review): .count() counts rows (one per party pairing), unlike the
# .nunique() used in the other geo cells — confirm which is intended here.
location_pd_1stcases = pd_1stcases.groupby("province")['case_id'].count().reset_index()
location_pd_1stcases.rename(columns={"case_id": "case_count"}, inplace=True)
# log10 colour scale keeps low-volume provinces distinguishable on the map.
geo_fig = px.choropleth_mapbox(
    data_frame = location_pd_1stcases,
    geojson = provinces_map,
    color = np.log10(location_pd_1stcases['case_count']),
    locations = "province",
    featureidkey = "properties.NL_NAME_1",
    color_continuous_scale = px.colors.sequential.Magenta,
    center = {"lat": 37.110573, "lon": 106.493924},
    hover_data = ['case_count'],
    zoom = 3,
    title = "Geographical Distribution of All First Cases"
)
geo_fig.update_layout(height = 800)
# offline.iplot(geo_fig)
geo_fig.show()
# Unique case count per (year, province); feeds the commented-out animated map below.
tl_pd_1stcases = (pd_1stcases.groupby(['year', 'province'])['case_id']
                  .nunique()
                  .reset_index()
                  .rename(columns={'case_id': 'case_count'}))
# geo_fig = px.choropleth_mapbox(
# data_frame = tl_pd_1stcases,
# geojson = provinces_map,
# color = np.log10(tl_pd_1stcases['case_count']),
# locations = "province",
# featureidkey = "properties.NL_NAME_1",
# color_continuous_scale = px.colors.sequential.Magenta,
# center = {"lat": 37.110573, "lon": 106.493924},
# animation_frame = "year",
# hover_data = ['case_count'],
# zoom = 3,
# title = "Temporal & Geographical Distribution of All First Cases"
# )
# geo_fig.update_layout(height = 800)
# geo_fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 2000
# offline.iplot(geo_fig)
# Quick sanity check of the flattened case/party table.
all_cases.head()
| case_id | judgement_date | is_success | court_name | procedure | judge | legalfee | objectmoney | province | city | reason | district | penalty | year | month | day | plaintiff | defendant | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 57ab9058c2265c28a560195d | 2014-01-13 | TRUE | 杭州市西湖区人民法院 | 一审 | 萧京 | 244.0 | 374.48 | 浙江省 | 杭州市 | 买卖合同纠纷 | 西湖区 | 3744.8 | 2014 | 1 | 13 | 喻忠 | 广东粤北行健康食品有限公司 |
| 1 | 57ab9058c2265c28a560195d | 2014-01-13 | TRUE | 杭州市西湖区人民法院 | 一审 | 萧京 | 244.0 | 374.48 | 浙江省 | 杭州市 | 买卖合同纠纷 | 西湖区 | 3744.8 | 2014 | 1 | 13 | 喻忠 | 浙江淘宝网络有限公司 |
| 2 | 57baba28c2265c5f452d2cef | 2014-01-27 | FALSE | 安康市汉滨区人民法院 | 一审 | 张延安 | 50.0 | 0.00 | 陕西省 | 安康市 | 买卖合同纠纷 | 汉滨区 | 0.0 | 2014 | 1 | 27 | 骆永明 | 冯坤 |
| 3 | 57abb74ac2265c258984430a | 2014-01-28 | TRUE | 嘉善县人民法院 | 一审 | 吕学强 | 25.0 | 450.00 | 浙江省 | 嘉兴市 | 民事案由 | 嘉善县 | 450.0 | 2014 | 1 | 28 | 白世桥 | 嘉善大润发商业有限公司 |
| 4 | 57ad1fe7c2265c04d1192a41 | 2014-01-10 | TRUE | 重庆市沙坪坝区人民法院 | 一审 | 何思静 | 80.0 | 1312.00 | 重庆市 | 重庆市 | 产品责任纠纷 | 沙坪坝区 | 6560.0 | 2014 | 1 | 10 | 周巧 | 重庆和平药房连锁有限责任公司沙坪坝药妆店 |
pd_2edcases = all_cases[all_cases['procedure']=="二审"]
len(pd_2edcases['case_id'].drop_duplicates())
6646
print("The success rate of the second cases: ", len(pd_2edcases[pd_2edcases['is_success']=="TRUE"])/len(pd_2edcases))
The success rate of the second cases: 0.2991278130720362
# Unique second-instance case count per judgement year.
time_pd_2edcases = pd_2edcases.groupby("year")['case_id'].nunique().reset_index()
time_pd_2edcases.rename(columns={"case_id": "case_count"}, inplace=True)
fig = px.bar(
    time_pd_2edcases,
    x = "year",
    y = "case_count",
    title = "Temporal Distribution of All Second Cases"
)
fig.show()
# Unique second-instance case count per province, mapped on a log10 colour scale.
location_pd_2edcases = pd_2edcases.groupby("province")['case_id'].nunique().reset_index()
location_pd_2edcases.rename(columns={"case_id": "case_count"}, inplace=True)
geo_fig = px.choropleth_mapbox(
    data_frame = location_pd_2edcases,
    geojson = provinces_map,
    color = np.log10(location_pd_2edcases['case_count']),
    locations = "province",
    featureidkey = "properties.NL_NAME_1",
    color_continuous_scale = px.colors.sequential.Magenta,
    center = {"lat": 37.110573, "lon": 106.493924},
    hover_data = ['case_count'],
    zoom = 3,
    title = "Geographical Distribution of All Second Cases"
)
geo_fig.update_layout(height = 800)
offline.iplot(geo_fig)
# Unique case count per (year, province); feeds the commented-out animated map below.
tl_pd_2edcases = (pd_2edcases.groupby(['year', 'province'])['case_id']
                  .nunique()
                  .reset_index()
                  .rename(columns={'case_id': 'case_count'}))
# geo_fig = px.choropleth_mapbox(
# data_frame = tl_pd_2edcases,
# geojson = provinces_map,
# color = np.log10(tl_pd_2edcases['case_count']),
# locations = "province",
# featureidkey = "properties.NL_NAME_1",
# color_continuous_scale = px.colors.sequential.Magenta,
# center = {"lat": 37.110573, "lon": 106.493924},
# animation_frame = "year",
# hover_data = ['case_count'],
# zoom = 3,
# title = "Temporal & Geographical Distribution of All Second Cases"
# )
# geo_fig.update_layout(height = 800)
# geo_fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 2000
# offline.iplot(geo_fig)
# From here on, work on the flattened table covering all procedures.
pd_cases = all_cases
# Unique case count per (year, procedure) for a grouped bar chart.
tp_pd_cases = pd_cases.groupby(['year', 'procedure'])['case_id'].nunique().reset_index()
tp_pd_cases.rename(columns={"case_id": "case_count"}, inplace=True)
fig = px.bar(
    tp_pd_cases,
    x = "year",
    y = "case_count",
    color = "procedure",
    title = "Temporal Distribution of Cases with Different Procedure"
)
fig.update_layout(barmode='group')
fig.show()
# Row counts per (year, province, procedure) for the side-by-side choropleths.
# NOTE(review): .count() counts rows, not unique cases — confirm intent.
tpl_pd_cases = pd_cases.groupby(['year', 'province', 'procedure'])['case_id'].count().reset_index()
tpl_pd_cases.rename(columns={"case_id": "case_count"}, inplace=True)
data = []
layout = dict(
    title = 'Geographical Distribution of All Case Procedures',
    autosize = False,
    width = 1200,
    height = 400,
)
procedures = tpl_pd_cases['procedure'].unique()
# One choropleth trace + one geo axis per procedure; plotly names the first
# geo axis 'geo' and the following ones 'geo2', 'geo3', ...
for i in range(len(procedures)):
    geo_key = 'geo'+str(i+1) if i != 0 else 'geo'
    data.append(
        dict(
            type = 'choropleth',
            locations = tpl_pd_cases[tpl_pd_cases['procedure'] == procedures[i]]['province'],
            hovertext = tpl_pd_cases[tpl_pd_cases['procedure'] == procedures[i]]['case_count'],
            hovertemplate = 'case count: %{hovertext}<br>province: %{location}',
            geo = geo_key,
            geojson = provinces_map,
            name = procedures[i],
            featureidkey = "properties.NL_NAME_1",
            # log10 colour scale shared via a single coloraxis across the subplots.
            z = np.log10(tpl_pd_cases[tpl_pd_cases['procedure'] == procedures[i]]['case_count']),
            coloraxis = "coloraxis",
        )
    )
    layout[geo_key] = dict(
        scope = "asia",
        showland = True,
        domain = dict(x = [], y = []),
        center = {"lat": 37.110573, "lon": 106.493924},
    )
# Assign each geo axis an equal horizontal slice of the figure (1 row x 3 columns).
z = 0
COLS = 3
ROWS = 1
for y in reversed(range(ROWS)):
    for x in range(COLS):
        geo_key = 'geo'+str(z+1) if z != 0 else 'geo'
        layout[geo_key]['domain']['x'] = [float(x)/float(COLS), float(x+1)/float(COLS)]
        layout[geo_key]['domain']['y'] = [float(y)/float(ROWS), float(y+1)/float(ROWS)]
        z=z+1
        # Stop once all configured geo axes are placed.
        if z > 3:
            break
fig = go.Figure(data=data, layout=layout)
# fig.update_layout(layout = layout)
fig.show()
# geo_fig_p1 = px.choropleth_mapbox(
# data_frame = tpl_pd_cases[tpl_pd_cases['procedure'] == procedures[0]],
# geojson = provinces_map,
# color = np.log10(tpl_pd_cases[tpl_pd_cases['procedure'] == procedures[0]]['case_count']),
# locations = "province",
# featureidkey = "properties.NL_NAME_1",
# color_continuous_scale = px.colors.sequential.Magenta,
# center = {"lat": 37.110573, "lon": 106.493924},
# animation_frame = "year",
# hover_data = ['case_count'],
# zoom = 3,
# title = "Temporal & Geographical Distribution of All First Cases"
# )
# geo_fig_p2 = px.choropleth_mapbox(
# data_frame = tpl_pd_cases[tpl_pd_cases['procedure'] == procedures[1]],
# geojson = provinces_map,
# color = np.log10(tpl_pd_cases[tpl_pd_cases['procedure'] == procedures[1]]['case_count']),
# locations = "province",
# featureidkey = "properties.NL_NAME_1",
# color_continuous_scale = px.colors.sequential.Magenta,
# center = {"lat": 37.110573, "lon": 106.493924},
# animation_frame = "year",
# hover_data = ['case_count'],
# zoom = 3,
# title = "Temporal & Geographical Distribution of All Second Cases"
# )
# geo_fig_p3 = px.choropleth_mapbox(
# data_frame = tpl_pd_cases[tpl_pd_cases['procedure'] == procedures[2]],
# geojson = provinces_map,
# color = np.log10(tpl_pd_cases[tpl_pd_cases['procedure'] == procedures[2]]['case_count']),
# locations = "province",
# featureidkey = "properties.NL_NAME_1",
# color_continuous_scale = px.colors.sequential.Magenta,
# center = {"lat": 37.110573, "lon": 106.493924},
# animation_frame = "year",
# hover_data = ['case_count'],
# zoom = 3,
# title = "Temporal & Geographical Distribution of All Retrial Cases"
# )
# geo_fig_p1.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 2000
# geo_fig_p2.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 2000
# geo_fig_p3.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 2000
# geo_fig_p1.show()
# geo_fig_p2.show()
# geo_fig_p3.show()
# Successful cases with a non-zero penalty, ordered by year.
# Fix: the pandas.np alias is deprecated/removed — use the numpy import directly
# (the FutureWarning this cell previously emitted confirms the issue).
time_pd_penalty = pd_cases[pd_cases['is_success']=="TRUE"].replace(0, np.nan).dropna(subset=['penalty']).sort_values(by='year')
<ipython-input-104-c60d2c400b8d>:1: FutureWarning: The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead
# Inspect cases whose penalty is at most 1 yuan.
pd_cases[pd_cases['penalty']<=1]
# Some cases award only 1 yuan or 0 yuan, e.g. 5a6212d4e138238c044893c6, 57ac901ec2265c28a56a8c9a
| case_id | judgement_date | is_success | court_name | procedure | judge | legalfee | objectmoney | province | city | reason | district | penalty | year | month | day | plaintiff | defendant | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 57baba28c2265c5f452d2cef | 2014-01-27 | FALSE | 安康市汉滨区人民法院 | 一审 | 张延安 | 50.0 | 0.00 | 陕西省 | 安康市 | 买卖合同纠纷 | 汉滨区 | 0.0 | 2014 | 1 | 27 | 骆永明 | 冯坤 |
| 6 | 57baba99c2265c5f452d3090 | 2014-01-27 | FALSE | 安康市汉滨区人民法院 | 一审 | 张延安 | 50.0 | 0.00 | 陕西省 | 安康市 | 买卖合同纠纷 | 汉滨区 | 0.0 | 2014 | 1 | 27 | 乔小锋 | 古春林 |
| 9 | 57baba8fc2265c5f452d303f | 2014-01-27 | FALSE | 安康市汉滨区人民法院 | 一审 | 张延安 | 50.0 | 0.00 | 陕西省 | 安康市 | 买卖合同纠纷 | 汉滨区 | 0.0 | 2014 | 1 | 27 | 骆永明 | 王继兴 |
| 10 | 57ac04bbc2265c04d10dbb22 | 2014-01-27 | FALSE | 厦门市思明区人民法院 | 一审 | 邱瑛 | 106.0 | 0.00 | 福建省 | 厦门市 | 买卖合同纠纷 | 思明区 | 0.0 | 2014 | 1 | 27 | 张铁牛 | 厦门润瑞商业有限公司 |
| 11 | 57ac901ec2265c28a56a8c9a | 2014-01-22 | TRUE | 郑州市管城回族区人民法院 | 一审 | 时满良 | 50.0 | 22.40 | 河南省 | 郑州市 | 买卖合同纠纷 | 管城回族区 | 0.0 | 2014 | 1 | 22 | 赵鹏 | 郑州润瑞商业有限公司 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 51942 | 5ed58d8b3b3e6922c62a3097 | 2020-05-13 | TRUE | 大石桥市人民法院 | 一审 | 马学烈 | 25.0 | 600.00 | 辽宁省 | 营口市 | 产品责任纠纷 | 大石桥市 | 0.0 | 2020 | 5 | 13 | 胡奎镇 | 大石桥市丽荣茶行1 |
| 51947 | 5ed55b553b3e6922c624d4bc | 2020-05-22 | TRUE | 湖州市吴兴区人民法院 | 一审 | 吴潇敏 | 25.0 | 6.50 | 浙江省 | 湖州市 | 产品责任纠纷 | 吴兴区 | 0.0 | 2020 | 5 | 22 | 韦春辉 | 杭州肯德基有限公司湖州万达广场分店 |
| 51953 | 5ee734b748198c22518bc693 | 2020-05-15 | FALSE | 金堂县人民法院 | 一审 | 王泽波 | 515.0 | 0.00 | 四川省 | 成都市 | 产品责任纠纷 | 金堂县 | 0.0 | 2020 | 5 | 15 | 宋浩东 | 闻书敏 |
| 51954 | 5ee734b748198c22518bc693 | 2020-05-15 | FALSE | 金堂县人民法院 | 一审 | 王泽波 | 515.0 | 0.00 | 四川省 | 成都市 | 产品责任纠纷 | 金堂县 | 0.0 | 2020 | 5 | 15 | 宋浩东 | 江西佰灵实业有限公司 |
| 51955 | 5edfc5588aaaa011b71fa98b | 2020-06-03 | TRUE | 江苏省连云港市海州区人民法院 | 一审 | 张海 | 1233.0 | 5212.23 | 江苏省 | 连云港市 | 网络购物合同纠纷 | 海州区 | 0.0 | 2020 | 6 | 3 | 胡明利 | 杭州乔汇贸易有限公司 |
12430 rows × 18 columns
# Yearly penalty distribution; log scale because penalties span several orders of magnitude.
fig = px.box(
    time_pd_penalty,
    x = "year",
    y = "penalty",
    title = "Box Plot of All Cases Penalties"
)
fig.update_yaxes(type="log")
fig.show()
# 标的小的赢的概率和标的大的赢的概率会不会有差别
# 赢的种类:只有货价,货价和惩罚金都有,但是罚金也有大于1000和小于1000的区别
# temp = pd_cases[pd_cases['is_success']=="TRUE"].fillna(0)
# 违法原因,标的大小和惩罚金1000左右的案件的关系
# 把案件文本的案件事实做个分词统计高频词,用来整理违法原因做进一步分析
# Mean non-zero penalty of successful cases, per province.
# Fix: the pandas.np alias is deprecated/removed — use the numpy import directly
# (the FutureWarning this cell previously emitted confirms the issue).
location_pd_penalty = pd_cases[pd_cases['is_success']=="TRUE"].replace(0, np.nan).dropna(subset=['penalty']).groupby("province")['penalty'].mean().reset_index()
<ipython-input-109-ebf5f5906860>:1: FutureWarning: The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead
# Map mean penalties per province (linear colour scale; log10 variant kept commented).
geo_fig = px.choropleth_mapbox(
    data_frame = location_pd_penalty,
    geojson = provinces_map,
    # color = np.log10(location_pd_penalty['penalty']),
    color = 'penalty',
    locations = "province",
    featureidkey = "properties.NL_NAME_1",
    color_continuous_scale = px.colors.sequential.Magenta,
    center = {"lat": 37.110573, "lon": 106.493924},
    zoom = 3,
    hover_data = ['penalty'],
    title = "Geographical Distribution of Case Penalties"
)
geo_fig.show()
# Keyword buckets for defendant names; each entry is a '|'-joined group of
# substrings (retail/supermarket, e-commerce, restaurants, pharmacies,
# tobacco & liquor, consumer associations).
defendant_type = ['超市|食品|商场|商业|百货|商贸|经销|日用', '网络|电商|科技|商城|电子|云', \
                  '饭店|小吃|餐饮|餐厅|酒店', '药', '烟酒|散酒', '消费者|委员会|协会']

def classify_defendant(row_in_pd):
    """Label the row with the defendant_type bucket whose keywords appear in the
    defendant's name; defaults to "其他" (other). When several buckets match,
    the one listed last in defendant_type wins."""
    name = str(row_in_pd['defendant'])
    label = "其他"
    for type_pattern in defendant_type:
        keywords = type_pattern.split("|")
        if any(keyword in name for keyword in keywords):
            label = type_pattern
    row_in_pd['defendant_type'] = label
    return row_in_pd
# Tag every row with a defendant category, then count unique cases per category.
all_cases = all_cases.apply(lambda row: classify_defendant(row), axis=1)
all_cases.groupby("defendant_type")['case_id'].nunique().reset_index()
| defendant_type | case_id | |
|---|---|---|
| 0 | 其他 | 10298 |
| 1 | 消费者|委员会|协会 | 2 |
| 2 | 烟酒|散酒 | 112 |
| 3 | 网络|电商|科技|商城|电子|云 | 3529 |
| 4 | 药 | 1677 |
| 5 | 超市|食品|商场|商业|百货|商贸|经销|日用 | 25497 |
| 6 | 饭店|小吃|餐饮|餐厅|酒店 | 1144 |
# Bar chart of unique case counts per defendant category.
fig = px.bar(
    all_cases.groupby("defendant_type")['case_id'].nunique().reset_index(),
    x = "defendant_type",
    y = "case_id",
    title = "The Distribution of Defendant Type"
)
fig.show()
# Split plaintiffs into "repeat" (appear in >1 case) and "unique" (exactly 1 case).
repeat_plaintiffs = list(df_plaintiff_degrees[df_plaintiff_degrees['degree'] > 1]['plaintiff'])
unique_plaintiffs = list(df_plaintiff_degrees[df_plaintiff_degrees['degree'] == 1]['plaintiff'])
len(repeat_plaintiffs), len(unique_plaintiffs)
# print(unique_plaintiffs)
# strange names: #
# ')诉', ')', '(',
# '章小军到庭参加诉讼', '李海林本人到庭', '杨锦宏到庭参加了诉讼',
# '黄远到庭参加诉讼', '张细晶丈夫)', '代国海(以下简称', '何红国到庭参加诉讼'
# strange_names = ['马艳楠之夫)', '胡洪亮(', '黄小华男', '所在村委会推荐', '战伟东(公民',
# ')李静华', '所在社区推荐公民)', '李书臣(', '刘博逊(公民', '到庭参加了诉讼;', '吴桐、',
# '吴海之兄', '均', '王福松哥哥', '熊佳丽之夫)', ')与被告黎月琴(以下简称', '佟立伟(公民',
# '王飞(', '尚庆风之弟)', '李康与被告周青年买卖合同纠纷一案', '邵天玉到庭参加了诉讼', '孙海波(公民',
# '黄海禹与被告龚光敏网络购物合同纠纷一案', '何红国与被告黄柳絮网络购物合同纠纷一案', '蒋作飞之弟',
# '谢秀君(以下简称', ')与被告郭园园(以下简称', '人(原审', '钟若萍之丈夫)', '姬秀珍之子)', '刘娜姝的配偶)',
# '魏培永之妻)', '刘进(同时受王月芹特别授权委托', '刘述强到庭参加诉讼', '梁国树到庭参加诉讼',
# '王连龙(公民', '张燕(陈强妻子)', '黄远与被告袁永网络购物合同纠纷一案', '(一审', '潘正清(同时受黄文雅的特别授权委托',
# ')美可高特羊乳有限公司[', '回登梅与被告天津市人人乐商业有限公司产品销售者责任纠纷一案', '王甲(同时受陶甲的授权委托',
# '刘述强到庭参加了诉讼', ')与被告黄祖成(以下简称', '何红国与被告孙艳华网络购物合同纠纷一案', '于中秋(公民',
# '陈天清之夫', '金鹏之子)', ')陆荣辉', '刘珍到庭参加了诉讼', '苏安湛诉', '黄海禹到庭参加诉讼', '儿子)',
# '张锦连之女)', '杨锦宏与被告王清网络购物合同纠纷一案', '王伟之夫)', '邓某(', '黄海禹与被告马绍珍、浙江淘宝网络有限公司网络购物合同纠纷一案',
# '陈友元妹妹', '万幸乐(公民', '冯茂林(', '配偶)', '回登莲之夫)', '李燕之夫)', '张军凯因买卖合同纠纷一案', '苏安湛到庭参加诉讼',
# '价款十倍赔偿金32000元;2、由', '李康到庭参加诉讼', '李锐与被告倪丽平产品责任纠纷一案', '认为涉案产品存在的问题',
# '深圳市都市贝贝母婴用品有限公司的职工',
# '杨芳因与被告张波网络购物合同纠纷一案', '霍金凤之夫)', '聂长荣之子)', 'on',
# '姚文严诉', '所在社区推荐的代理人', '王分分诉', '任满仓、', '谢志桂本人到庭',
# '徐金兰之子)', '代国海与被告葛兰杰网络购物合同纠纷一案', '张芳君之子', '张浩铭(公民', '智海燕男',
# '陈勇萍的妻子)', '李锐与被告陈媛媛产品责任纠纷一案', '广西众生堂医药有限公司的共同委托诉讼代理人',
# '广西众生堂医药有限公司的共同委托代理人', '到庭参加诉讼', '衡庆因与', '青海览进生物科技开发有限公司、洛阳雨东电子商务有限公司的',
# '何旺来到庭参加了诉讼', '阳秋旺到庭', '胡一定本人到庭', '刘俊梅之夫', '赵甲.', '贺璟;', '李俊华本人到庭',
# '深圳市都市贝贝母婴用品有限公司的员工', '韦恒本人到庭', '陈洪东本人到庭',
# '到庭', '任满仓到庭参加诉讼', '郭勇到庭参加诉讼', '杨锦宏到庭参加诉讼', '代国海到庭参加了诉讼',
# '许承凯到庭', '许承凯本人到庭', '何旺来诉', '任满仓诉', '郭勇诉', '杨锦宏诉', '代国海诉']
(3305, 2357)
'''
关于重复原告关联案件的定义:
1. 一个案件只有一个原告,如果该原告属于重复原告,则这个案件就被划分为和重复原告相关的案件
2. 一个案件有两个或以上原告,只要有原告是重复原告,则该案件被划分为和重复原告相关的案件
3. 一个案件里不论有多少个原告,如果所有原告都不是重复原告,则该案件被划分为和非重复原告相关的案件
'''
# Collapse to one row per case, carrying the set of its plaintiff names.
temp = all_cases.groupby('case_id')['plaintiff'].apply(set).reset_index()
temp.head()
| case_id | plaintiff | |
|---|---|---|
| 0 | 57a5e43ec2265c04d1d8f3de | {周悟权} |
| 1 | 57a5e626c2265c04d1d9044b | {周悟权} |
| 2 | 57a5e6b3c2265c28a5302df3 | {包鹏} |
| 3 | 57a5e785c2265c2589531480 | {殷程} |
| 4 | 57a5e8f3c2265c04d1d91c99 | {王娴} |
# Fix: membership tests against the ~3.3k-entry repeat-plaintiff *list* ran in
# O(n) per plaintiff per case row (~39k rows) — convert once to a set for O(1)
# lookups; any() also replaces the redundant double-return in the original.
repeat_plaintiff_set = set(repeat_plaintiffs)

def list_isn_list(row):
    """Flag the case row as repeat-plaintiff-related if any of its plaintiffs is
    a repeat plaintiff (per the definition commented in the cell above)."""
    row['is_repeat_plaintiff_case'] = any(p in repeat_plaintiff_set for p in row['plaintiff'])
    return row

temp = temp.apply(list_isn_list, axis=1)
temp
| case_id | plaintiff | is_repeat_plaintiff_case | |
|---|---|---|---|
| 0 | 57a5e43ec2265c04d1d8f3de | {周悟权} | True |
| 1 | 57a5e626c2265c04d1d9044b | {周悟权} | True |
| 2 | 57a5e6b3c2265c28a5302df3 | {包鹏} | True |
| 3 | 57a5e785c2265c2589531480 | {殷程} | True |
| 4 | 57a5e8f3c2265c04d1d91c99 | {王娴} | True |
| ... | ... | ... | ... |
| 39252 | 5fc0746e67ec044cbc093333 | {沈郊东} | False |
| 39253 | 5fc079a767ec044cbc26336c | {陈天清} | True |
| 39254 | 5fe2d1c1b221d546ac20d2a9 | {王福松} | True |
| 39255 | 6000f1eeec7d0c16ecfe3f5e | {李健} | True |
| 39256 | 6000fa3fec7d0c16ec0e11e0 | {张杰} | True |
39257 rows × 3 columns
# Compare the sizes of the non-repeat-plaintiff vs repeat-plaintiff case partitions.
temp[temp['is_repeat_plaintiff_case']==False].info(), temp[temp['is_repeat_plaintiff_case']==True].info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2524 entries, 14 to 39252 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 case_id 2524 non-null object 1 plaintiff 2524 non-null object 2 is_repeat_plaintiff_case 2524 non-null bool dtypes: bool(1), object(2) memory usage: 61.6+ KB <class 'pandas.core.frame.DataFrame'> Int64Index: 36733 entries, 0 to 39256 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 case_id 36733 non-null object 1 plaintiff 36733 non-null object 2 is_repeat_plaintiff_case 36733 non-null bool dtypes: bool(1), object(2) memory usage: 896.8+ KB
(None, None)
# Sanity check: the two partitions should add back up to the total case count.
# 36811 + 2619, 39430, 2627+36803
39257, 2524+36733
(39257, 39257)
# After removing cases whose plaintiff names are noisy/garbled:
# (unique plaintiffs, non-repeat-plaintiff cases, residual discrepancy).
5662-3305, 39257-36733, 2524-2357
(2357, 2524, 167)
唯一原告仍存在167的误差
误差原因:
# Export the IDs of pharmacy-type defendant cases for manual review.
all_cases[all_cases['defendant_type']=="药"]['case_id'].drop_duplicates().to_csv("/Users/starice/Desktop/cases_deftype_medicine.csv")
# Select all first-instance cases.
first_cases = all_cases[all_cases['procedure']=="一审"]
print("一审案件数量: ", len(first_cases['case_id'].drop_duplicates()))
# Rank the first-instance plaintiffs by the number of distinct cases they filed.
degree_1stplaintiffs = first_cases.groupby("plaintiff")['case_id'].unique().reset_index()
degree_1stplaintiffs['case_count'] = degree_1stplaintiffs['case_id'].apply(len)
degree_1stplaintiffs.sort_values(by="case_count", inplace=True, ascending=False)
# For the top-50, top-100, ..., top-500 plaintiffs, collect their first-instance cases.
new_selected_1stcp_list = [
    first_cases[first_cases['plaintiff'].isin(degree_1stplaintiffs[0:top_n]['plaintiff'])]
    for top_n in range(50, 550, 50)
]
# fps_50, fps_100, fps_150, fps_200, fps_250 = degree_1stplaintiffs[:50], degree_1stplaintiffs[50:100], degree_1stplaintiffs[100:150], degree_1stplaintiffs[150:200], degree_1stplaintiffs[200:250]
一审案件数量: 32445
# Share of Chongqing cases among the cases filed by the top-N plaintiffs.
# Fix: DataFrame.append is deprecated and removed in pandas 2.0 — collect the
# rows in a list and build the frame once (also avoids quadratic re-copying).
rows = []
n = 50
for i in new_selected_1stcp_list:
    rows.append({"num": n,
                 "ccase_count": len(i[i['province']=="重庆市"]['case_id'].drop_duplicates()),
                 "case_count": len(i['case_id'].drop_duplicates())})
    n += 50
df_chongqing_case = pd.DataFrame(rows, columns=['num', 'ccase_count', 'case_count'])
df_chongqing_case['chongqing_case_rate'] = df_chongqing_case['ccase_count'] / df_chongqing_case['case_count']
df_chongqing_case.head()
| num | ccase_count | case_count | chongqing_case_rate | |
|---|---|---|---|---|
| 0 | 50 | 5467 | 9383 | 0.582649 |
| 1 | 100 | 6033 | 12323 | 0.489572 |
| 2 | 150 | 6471 | 14413 | 0.44897 |
| 3 | 200 | 6800 | 16084 | 0.42278 |
| 4 | 250 | 6912 | 17405 | 0.397127 |
# How the Chongqing share shrinks as the top-N plaintiff pool widens.
fig = px.bar(
    df_chongqing_case,
    x = "num",
    y = "chongqing_case_rate",
    hover_data = ['ccase_count', 'case_count']
)
fig.show()
# Worth comparing Chongqing against other cities, since it accounts for a large share of cases.
# Are most plaintiffs also from Chongqing? (Yes.)
import statsmodels.api as sm
import statsmodels.formula.api as smf
def generate_nmonths_cc(row, num, df):
    """Attach to `row` the total case_count in `df` over the `num` months
    starting at row['date'] (start inclusive, end exclusive), stored under
    the key 'cc_<num>m_after'."""
    window_start = row['date']
    window_end = window_start + pd.DateOffset(months=num)
    in_window = (df['date'] >= window_start) & (df['date'] < window_end)
    row['cc_' + str(num) + "m_after"] = df[in_window]['case_count'].sum()
    return row
def sr_cc_analysis(province, sr_months, cc_months):
    """For one province, regress the number of first-instance cases filed in the
    `cc_months` months after each window on the success rate over the preceding
    `sr_months`-month window; prints the OLS summary, plots the series, and
    returns the merged per-window DataFrame.

    Relies on the module-level `pd_1stcases` table and `generate_nmonths_cc`.
    """
    temp_cases = pd_1stcases[pd_1stcases['province']==province]
    # Monthly totals: all cases vs successful cases.
    time_nselected_1stcp_cc = temp_cases.groupby(["province", "year", "month"])['case_id'].nunique().reset_index()
    time_nselected_1stcp_sc = temp_cases[temp_cases['is_success']=="TRUE"].groupby(["year", "month", "province"])['case_id'].nunique().reset_index()
    time_nselected_1stcp_cc.rename(columns={"case_id": "case_count"}, inplace=True)
    time_nselected_1stcp_sc.rename(columns={"case_id": "success_count"}, inplace=True)
    time_nselected_1stcp = time_nselected_1stcp_cc.merge(time_nselected_1stcp_sc, how="left")
    # Months without any successful case are NaN after the left merge.
    time_nselected_1stcp.fillna(0, inplace=True)
    time_nselected_1stcp = time_nselected_1stcp.sort_values(by=["province", "year", "month"])
    time_nselected_1stcp['time'] = time_nselected_1stcp[['year', 'month', 'province']].apply(lambda r: str(r['year']) + "-" + str(r['month']), axis=1)
    dates = pd.to_datetime(time_nselected_1stcp.time, format='%Y-%m')
    time_nselected_1stcp['date'] = dates
    # Resample into sr_months-wide windows and compute the window success rate.
    freq = str(sr_months) + 'MS'
    df_province = (time_nselected_1stcp
     .assign(date=dates)
     .groupby(['province', 'year', pd.Grouper(key='date', freq=freq)])
     .sum()
     .reset_index())
    df_province['success_rate_' + str(sr_months) + 'm'] = df_province['success_count'] / df_province['case_count']
    # For every month, compute the case count over the following cc_months months.
    time_nselected_1stcp = time_nselected_1stcp.apply(lambda row: generate_nmonths_cc(row, cc_months, time_nselected_1stcp), axis=1)
    final_df_province = df_province.merge(time_nselected_1stcp, on=['province', 'year', 'date'], how="left")
    # Shift so each window is paired with the case count following the NEXT window start.
    final_df_province['cc_' + str(cc_months) + "m_after"] = final_df_province['cc_' + str(cc_months) + "m_after"].shift(-1)
    final_df_province.rename(columns={"case_count_x": "case_count_" + str(sr_months) + "m", \
                                      "success_count_x": "success_count_" + str(sr_months) + "m"}, inplace=True)
    # NOTE(review): dropna() is not in-place and its result is discarded — NaN rows
    # still reach the regression below; confirm whether this is intended.
    final_df_province.dropna()
    # regression part
    mod = smf.ols(str(str('cc_' + str(cc_months) + 'm_after') + '~' + str('success_rate_' + str(sr_months) + 'm')), \
                  data=final_df_province)
    model = mod.fit()  # Build and fit the ordinary-least-squares model.
    print(model.summary())  # Print the regression results.
    # Fitted line: intercept + slope * success_rate, for plotting alongside the data.
    final_df_province[str('predict_cc_' + str(cc_months) + 'm_after')] = \
        model.params[0] + model.params[1] * final_df_province[str('success_rate_' + str(sr_months) + 'm')]
    fig = px.line(
        final_df_province,
        x = 'date',
        y = [
            str('success_rate_' + str(sr_months) + 'm'),
            str('cc_' + str(cc_months) + 'm_after'),
            str('predict_cc_' + str(cc_months) + 'm_after')
        ]
    )
    fig.update_yaxes(type="log")
    fig.update_layout(width=1000, height=500)
    fig.show()
    return final_df_province
final_df_province = sr_cc_analysis("重庆市", 3, 2)  # Relate Chongqing's success rate over 3-month windows to the subsequent case count (cc window = 2 months)
OLS Regression Results
==============================================================================
Dep. Variable: cc_2m_after R-squared: 0.000
Model: OLS Adj. R-squared: -0.043
Method: Least Squares F-statistic: 0.004848
Date: Fri, 04 Jun 2021 Prob (F-statistic): 0.945
Time: 11:33:11 Log-Likelihood: -170.30
No. Observations: 25 AIC: 344.6
Df Residuals: 23 BIC: 347.0
Df Model: 1
Covariance Type: nonrobust
===================================================================================
coef std err t P>|t| [0.025 0.975]
-----------------------------------------------------------------------------------
Intercept 137.0790 491.598 0.279 0.783 -879.868 1154.026
success_rate_3m 37.9467 544.973 0.070 0.945 -1089.416 1165.309
==============================================================================
Omnibus: 13.101 Durbin-Watson: 1.233
Prob(Omnibus): 0.001 Jarque-Bera (JB): 11.979
Skew: 1.554 Prob(JB): 0.00250
Kurtosis: 4.357 Cond. No. 21.5
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.